## Set code chunk option defaults
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
## Load packages needed
library(tidyverse); library(glue)
library(viridis)
## Set theme for plots (only works when you load ggplot2, which can be done using library(ggplot2) OR with library(tidyverse))
theme_set(theme_bw())
scale_color_continuous <- function(...) scale_color_viridis_c(...)
scale_fill_continuous <- function(...) scale_fill_viridis_c(...)
scale_color_discrete <- function(...) scale_color_viridis_d(...)
scale_fill_discrete <- function(...) scale_fill_viridis_d(...)
## Import data and manipulate a bit
framingham <- read_csv("../../data/Framingham/framingham.csv") %>%
rename(gender = male) %>%
mutate(gender = if_else(gender == 1, 'male', 'female'))
About FHS:
Started in 1948 to investigate cardiovascular disease (CVD)
At the time, little was known about CVD, but death rates were increasing
Researchers recruited 5,209 men and women between the ages of 30 and 62
Subjects returned every two years for detailed medical history and physical examinations
In 1971, enrolled 5,124 new subjects - offspring of the original subjects!
In 2002, started enrolling third generation!
Side note: Beaver Dam Eye Study here in Wisconsin is very similar. They are now on their third generation of subjects as well.
Subset of FHS data (from kaggle). Data from 4240 subjects collected over a 10 year period. Outcome: risk factors for developing Coronary Heart Disease (CHD).
## Set options for DT
options(DT.options = list(scrollX = TRUE))
## Print data using DT::datatable
framingham %>%
DT::datatable()
cat_vars <- c('gender', 'currentSmoker', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes', 'TenYearCHD')
ord_vars <- 'education'
cont_vars <- setdiff(colnames(framingham), c(cat_vars, ord_vars))
In this data set we have 7 categorical variables, 1 ordinal variable, and 8 continuous variables. We previously looked at various ways of summarizing and describing this data set. Here, we will perform different tests, and calculate confidence intervals for different parameters. We will look for differences between those that developed CHD over the ten year period, and those who did not.
variables_and_types <- tibble(variable = c(cat_vars, ord_vars, cont_vars),
type = rep(c("categorical", "continuous"), times = c(length(cat_vars) + 1, length(cont_vars))))
for_analyses <- framingham %>%
gather(key = 'variable', value = 'value', -TenYearCHD) %>%
group_by(variable) %>%
nest(Data = c(TenYearCHD, value)) %>%
left_join(variables_and_types) %>%
mutate(Data = map(Data, function(x) filter(x, complete.cases(x))),
n_categories = map2_dbl(Data, type, function(x,y){
if(y == "categorical"){
x %>% pull(value) %>% unique() %>% length()
} else {
NA_real_
}
})) %>%
arrange(type, n_categories)
First, contingency tables for categorical variables:
contingency_tables <- for_analyses %>%
filter(type == "categorical") %>%
unnest(cols = Data) %>%
count(variable, TenYearCHD, value) %>%
group_by(variable, value) %>%
mutate(Totals = sum(n),
n = glue("{round(n/Totals, digits = 4)} (n = {n})"),
TenYearCHD = paste("TenYearCHD", TenYearCHD)) %>%
spread(TenYearCHD, n) %>%
ungroup() %>%
split(.$variable) %>%
map2(.x = ., .y = names(.),
#~print(.y))
~.x %>% rename(!!.y := value) %>% select(-variable))
contingency_tables %>%
pander::pander()
BPMeds:
| BPMeds | Totals | TenYearCHD 0 | TenYearCHD 1 |
|---|---|---|---|
| 0 | 4063 | 0.8543 (n = 3471) | 0.1457 (n = 592) |
| 1 | 124 | 0.6694 (n = 83) | 0.3306 (n = 41) |
currentSmoker:
| currentSmoker | Totals | TenYearCHD 0 | TenYearCHD 1 |
|---|---|---|---|
| 0 | 2145 | 0.855 (n = 1834) | 0.145 (n = 311) |
| 1 | 2095 | 0.8411 (n = 1762) | 0.1589 (n = 333) |
diabetes:
| diabetes | Totals | TenYearCHD 0 | TenYearCHD 1 |
|---|---|---|---|
| 0 | 4131 | 0.8538 (n = 3527) | 0.1462 (n = 604) |
| 1 | 109 | 0.633 (n = 69) | 0.367 (n = 40) |
education:
| education | Totals | TenYearCHD 0 | TenYearCHD 1 |
|---|---|---|---|
| 1 | 1720 | 0.8122 (n = 1397) | 0.1878 (n = 323) |
| 2 | 1253 | 0.8827 (n = 1106) | 0.1173 (n = 147) |
| 3 | 689 | 0.8723 (n = 601) | 0.1277 (n = 88) |
| 4 | 473 | 0.852 (n = 403) | 0.148 (n = 70) |
gender:
| gender | Totals | TenYearCHD 0 | TenYearCHD 1 |
|---|---|---|---|
| female | 2420 | 0.8756 (n = 2119) | 0.1244 (n = 301) |
| male | 1820 | 0.8115 (n = 1477) | 0.1885 (n = 343) |
prevalentHyp:
| prevalentHyp | Totals | TenYearCHD 0 | TenYearCHD 1 |
|---|---|---|---|
| 0 | 2923 | 0.8909 (n = 2604) | 0.1091 (n = 319) |
| 1 | 1317 | 0.7532 (n = 992) | 0.2468 (n = 325) |
prevalentStroke:
| prevalentStroke | Totals | TenYearCHD 0 | TenYearCHD 1 |
|---|---|---|---|
| 0 | 4215 | 0.8498 (n = 3582) | 0.1502 (n = 633) |
| 1 | 25 | 0.56 (n = 14) | 0.44 (n = 11) |
Means, standard deviations, and number of observations for continuous variables:
means_and_sds <- for_analyses %>%
filter(type == "continuous") %>%
select(variable, Data) %>%
unnest(cols = Data) %>%
group_by(variable, TenYearCHD) %>%
summarise(s = round(sd(as.numeric(value)), digits = 4),
m = round(mean(as.numeric(value)), digits = 4),
n = n()) %>%
ungroup() %>%
mutate(TenYearCHD = paste("TenYearCHD =", TenYearCHD),
out = glue("{m} (SD = {s}; n = {n})")) %>%
select(variable, TenYearCHD, out) %>%
spread(TenYearCHD, out)
pander::pander(means_and_sds)
| variable | TenYearCHD = 0 | TenYearCHD = 1 |
|---|---|---|
| age | 48.7625 (SD = 8.414; n = 3596) | 54.146 (SD = 8.0057; n = 644) |
| BMI | 25.6717 (SD = 3.9834; n = 3587) | 26.5315 (SD = 4.5221; n = 634) |
| cigsPerDay | 8.7139 (SD = 11.6949; n = 3569) | 10.6293 (SD = 13.0066; n = 642) |
| diaBP | 82.1664 (SD = 11.3383; n = 3596) | 86.9814 (SD = 14.0269; n = 644) |
| glucose | 80.6793 (SD = 18.963; n = 3258) | 89.0084 (SD = 41.1408; n = 594) |
| heartRate | 75.7625 (SD = 11.9882; n = 3596) | 76.5303 (SD = 12.22; n = 643) |
| sysBP | 130.3373 (SD = 20.4504; n = 3596) | 143.6188 (SD = 26.6903; n = 644) |
| totChol | 235.1474 (SD = 43.7657; n = 3555) | 245.389 (SD = 48.0757; n = 635) |
For each of the variables, we can perform a test to see if there’s a “statistically significant difference” between the two groups (i.e. those that developed CHD, and those that did not). Here, we’ll perform a chi-square test for the categorical variables, and a two sample t-test for continuous variables.
chisq_results <- for_analyses %>%
filter(type == "categorical") %>%
mutate(test = map(Data, ~broom::tidy(chisq.test(x = .x$TenYearCHD, y = .x$value)))) %>%
unnest_wider(test) %>%
select(variable, statistic, p.value, method)
ttests <- for_analyses %>%
filter(type == "continuous") %>%
ungroup() %>%
select(variable, Data) %>%
mutate(test = map(Data, function(x){
sds <- x %>% group_by(TenYearCHD) %>% summarise(sd = sd(as.numeric(value))) %>% pull(sd)
broom::tidy(t.test(as.numeric(x$value) ~ x$TenYearCHD, var.equal = (max(sds)/min(sds) < 2)))
})) %>%
unnest_wider(test) %>%
mutate(estimate = estimate1 - estimate2) %>%
select(variable, estimate, statistic, p.value, conf.low, conf.high, method)
chisq_results %>%
mutate(p.value = if_else(p.value < 0.0001, "<0.0001", format(round(p.value, digits = 4), scientific = FALSE))) %>%
pander::pander()
| variable | statistic | p.value | method |
|---|---|---|---|
| gender | 32.62 | <0.0001 | Pearson’s Chi-squared test with Yates’ continuity correction |
| currentSmoker | 1.497 | 0.2211 | Pearson’s Chi-squared test with Yates’ continuity correction |
| BPMeds | 30.65 | <0.0001 | Pearson’s Chi-squared test with Yates’ continuity correction |
| prevalentStroke | 14.03 | 0.0002 | Pearson’s Chi-squared test with Yates’ continuity correction |
| prevalentHyp | 132.5 | <0.0001 | Pearson’s Chi-squared test with Yates’ continuity correction |
| diabetes | 38.48 | <0.0001 | Pearson’s Chi-squared test with Yates’ continuity correction |
| education | 32.02 | <0.0001 | Pearson’s Chi-squared test |
ttests %>%
mutate_if(is.numeric, round, digits = 5) %>%
mutate(p.value = if_else(p.value < 0.0001, "<0.0001", as.character(p.value))) %>%
select(-contains("conf.")) %>%
pander::pander()
| variable | estimate | statistic | p.value | method |
|---|---|---|---|---|
| age | -5.383 | -15.06 | <0.0001 | Two Sample t-test |
| cigsPerDay | -1.915 | -3.753 | 0.00018 | Two Sample t-test |
| totChol | -10.24 | -5.349 | <0.0001 | Two Sample t-test |
| sysBP | -13.28 | -14.43 | <0.0001 | Two Sample t-test |
| diaBP | -4.815 | -9.548 | <0.0001 | Two Sample t-test |
| BMI | -0.8598 | -4.905 | <0.0001 | Two Sample t-test |
| heartRate | -0.7678 | -1.491 | 0.13592 | Two Sample t-test |
| glucose | -8.329 | -4.841 | <0.0001 | Welch Two Sample t-test |
What does this tell us? Most variables seem to be different between the two groups. Given the variables collected, this might not be a huge surprise.
Let us refresh how to perform a chi-square test by testing if there’s a difference in the distribution of the educational level between the two groups.
To do so, first we (as always) write down our hypotheses: \(H_0: \text{no difference in distributions}\) and \(H_A: \text{some difference in distributions}\).
Next, we create the contingency table:
four_by_two <- for_analyses %>%
filter(variable == "education") %>%
unnest(cols = Data) %>%
janitor::tabyl(TenYearCHD, value) %>%
janitor::adorn_totals(where = c("row", "col")) %>%
rename(`TenYearCHD \\\\ Education` = TenYearCHD)
four_by_two %>%
pander::pander()
| TenYearCHD \ Education | 1 | 2 | 3 | 4 | Total |
|---|---|---|---|---|---|
| 0 | 1397 | 1106 | 601 | 403 | 3507 |
| 1 | 323 | 147 | 88 | 70 | 628 |
| Total | 1720 | 1253 | 689 | 473 | 4135 |
Now, the idea is that if there’s no difference in the distribution between the two groups, then the people that developed CHD and the people that don’t should be distributed across the educational levels the same as the overall sample. In other words, the 3507 in the first row should be distributed across the levels 1 through 4 in the same way as the 4135 in the Total row. Same holds for the 628 in the second row.
So, if the null hypothesis is true, we can find the expected number of people in each of the cells. To do so, we start by creating a duplicate table where we remove all the values in the cells.
empty_cells <- four_by_two %>%
mutate_at(vars(`1`:`4`), function(x) c("", "", x[3]))
empty_cells %>%
pander::pander()
| TenYearCHD \ Education | 1 | 2 | 3 | 4 | Total |
|---|---|---|---|---|---|
| 0 | 3507 | ||||
| 1 | 628 | ||||
| Total | 1720 | 1253 | 689 | 473 | 4135 |
We then begin to fill it out. To do so, we find the proportion of the total number of people in each of the columns, and then we distribute the two other row totals in a similar way:
expected_counts <- empty_cells %>%
mutate(`1` = Total*as.numeric(`1`)[!is.na(as.numeric(`1`))]/4135,
`2` = Total*as.numeric(`2`)[!is.na(as.numeric(`2`))]/4135,
`3` = Total*as.numeric(`3`)[!is.na(as.numeric(`3`))]/4135,
`4` = Total*as.numeric(`4`)[!is.na(as.numeric(`4`))]/4135)
expected_counts %>%
pander::pander()
| TenYearCHD \ Education | 1 | 2 | 3 | 4 | Total |
|---|---|---|---|---|---|
| 0 | 1459 | 1063 | 584.4 | 401.2 | 3507 |
| 1 | 261.2 | 190.3 | 104.6 | 71.84 | 628 |
| Total | 1720 | 1253 | 689 | 473 | 4135 |
exp_counts <- expected_counts %>% filter(row_number() < 3) %>% select(`1`:`4`)
obs_counts <- four_by_two %>% filter(row_number() < 3) %>% select(`1`:`4`)
diff_counts <- ((obs_counts-exp_counts)^2/obs_counts)
diff_counts %>%
as_tibble() %>%
mutate(`TenYearCHD \\\\ Education` = c(0,1)) %>%
select(`TenYearCHD \\\\ Education`, everything()) %>%
pander::pander()
| TenYearCHD \ Education | 1 | 2 | 3 | 4 |
|---|---|---|---|---|
| 0 | 2.732 | 1.695 | 0.4608 | 0.008369 |
| 1 | 11.82 | 12.75 | 3.147 | 0.04818 |
Finally, we find the observe value of the chi square test statistic by simply adding up add the difference. The observed value is 32.6598258. To find the p-value, we find the probability of observing something “more extreme”. Some slightly tedious math shows us that the chi-square test statistic follows a \(\chi^2_k\) distribution, where \(k = (\text{number of rows} - 1)\cdot(\text{number of columns} - 1)\) is the degrees of freedom.
Notice how in this case, “more extreme” means greater – always. This is because of the square in the test statistic. If the observations are further away from the expectation, then the test statistic gets bigger, and vice versa. At the same time, if the observations are closer to the expectation, the test statistic gets smaller, and vice versa. Therefore, the p-value is the probability of a \(\chi^2_3\) random variable being greater than 32.6598258. This is depicted below. The area is super small – around 3.810^{-7}, well below any significance level you might want to use.
chisq_dist <- data.frame(x = seq(0, 40, by = 0.01)) %>%
ggplot(aes(x = x, y = dchisq(x, df = 3))) +
geom_line(color = 'blue') +
geom_vline(xintercept = sum(diff_counts),
linetype = 'dashed',
color = 'red') +
scale_y_continuous("", expand = expand_scale(mult = c(0, 0.1), add = 0))
plotly::ggplotly(chisq_dist)
To calculate confidence intervals for the continuous variables is fairly easy. Granted, you have to calculate the pooled variance \(s_p^2 = \frac{(n_1 - 1)\cdot s_1^2 + (n_2 - 1)\cdot s_0^2}{n_1 + n_2 - 2}\), but once that is done, you simply find \(\bar{\text{diff}} \pm 1.96 \cdot s_p \cdot \sqrt{1/n_1 + 1/n_0}\). The results can be found below.
CI_BMI <- ttests %>%
filter(variable == 'BMI') %>%
select(conf.low, conf.high) %>%
mutate_all(round, digits = 4) %>%
paste(collapse = ", ")
ttests %>%
pander::pander(split.table = Inf)
| variable | estimate | statistic | p.value | conf.low | conf.high | method |
|---|---|---|---|---|---|---|
| age | -5.383 | -15.06 | 5.573e-50 | -6.084 | -4.683 | Two Sample t-test |
| cigsPerDay | -1.915 | -3.753 | 0.0001769 | -2.916 | -0.9149 | Two Sample t-test |
| totChol | -10.24 | -5.349 | 9.335e-08 | -14 | -6.488 | Two Sample t-test |
| sysBP | -13.28 | -14.43 | 4.217e-46 | -15.09 | -11.48 | Two Sample t-test |
| diaBP | -4.815 | -9.548 | 2.171e-21 | -5.804 | -3.826 | Two Sample t-test |
| BMI | -0.8598 | -4.905 | 9.696e-07 | -1.203 | -0.5161 | Two Sample t-test |
| heartRate | -0.7678 | -1.491 | 0.1359 | -1.777 | 0.2415 | Two Sample t-test |
| glucose | -8.329 | -4.841 | 1.618e-06 | -11.71 | -4.951 | Welch Two Sample t-test |
As discussed last week, confidence intervals provide so much more information than the results of the corresponding hypothesis tests. It provides us with a range of values that we are somewhat confident will contain the true value.
For example, we are \(95\%\) that the true mean difference in BMI between those that developed CHD and those that did not is between \([-1.2035, -0.5161]\). First of all, we notice that \(0\) is not in the interval. This tells us that the null hypothesis \(H_0: \mu_1 - \mu_0 = 0\) would be rejected if we used a significance level of \(0.05\). But it also gives us an idea of the range of the difference. For example, if it was well-known that a change in BMI doesn’t really matter until it’s a change of \(1.5\) or more, then this tells us that, yes, there is a “statistically significant difference”, but it might not matter in the real world. (The truth is, if you just increase the sample size enough, you will always reach a “statistically significant” result, even if in fact there’s no difference.)
So that’s for the continuous variables, but how do we handle the categorical variables? To do so, we have to change our question a bit.
Using the chi square test simply answers the question: is there a difference? This is not exactly a quantity we can calculate a confidence interval for… In general, when dealing with categorical variables, you would probably take a look at relative risks (RRs) or odds ratios (ORs). When the variable is binary, this is straight forward. If not, then you would pick a group to use as a baseline, and then compare the rest of the groups to that group.
Let’s consider the relative risks for the variables that are binary (i.e. all of them except education). To actually calculate confidence intervals for the relative risks, we need to take a detour. Unfortunately, the distribution of the relative risk is not super easy to find. Therefore we don’t really know what distribution to look at when looking for critical values. However, the log relative risk is actually (approximately) normal! So, we will find a confidence interval for the log relative risk first, then translate that into a confidence interval for the relative risk.
library(fmsb)
RRs <- for_analyses %>%
filter(type == "categorical",
variable != "education") %>%
unnest(cols = Data) %>%
count(variable, TenYearCHD, value) %>%
group_by(variable, TenYearCHD) %>%
mutate(value = case_when(value == "male" ~ 1,
value == "female" ~ 0,
TRUE ~ as.numeric(value))) %>%
ungroup() %>%
mutate(TenYearCHD = paste("TenYearCHD", TenYearCHD),
value = paste("Group", value)) %>%
spread(TenYearCHD, n) %>%
group_by(variable) %>%
nest() %>%
mutate(
my_rrs = map(data, function(x){
RR_est <- (x[2,3]/(x[2,2] + x[2,3]))/((x[1,3]/(x[1,2] + x[1,3])))
SD <- sqrt((x[2,2]/x[2,3])/(x[2,2] + x[2,3]) + (x[1,2]/x[1,3])/(x[1,2] + x[1,3]))
lower_CI <- log(RR_est[[1]]) - 1.96*SD[[1]]
upper_CI <- log(RR_est[[1]]) + 1.96*SD[[1]]
tibble(RR_est = RR_est[[1]],
logRR_est = log(RR_est[[1]]),
lower_CI_log = lower_CI,
upper_CI_log = upper_CI) %>%
return()
}))
RRs_tibble <- RRs %>%
select(variable, my_rrs) %>%
unnest_wider(col = my_rrs)
As an example of how to do these calculations, we’ll consider the BPMeds variable. First step is to calculate the contingency table:
BPMeds_table <- for_analyses %>%
filter(variable == "BPMeds") %>%
unnest(cols = Data) %>%
janitor::tabyl(value, TenYearCHD) %>%
janitor::adorn_totals("col") %>%
rename(`BPMeds \\\\ TenYearCHD` = value)
BPMeds_table_RR <- (41/(41+83))/(592/(592+3471))
log_CI <- round(log(BPMeds_table_RR) + c(-1,1)*1.96*sqrt(((124 - 41)/41)/(124) + ((4063 - 592)/592)/4063), digits = 4)
pander::pander(BPMeds_table)
| BPMeds \ TenYearCHD | 0 | 1 | Total |
|---|---|---|---|
| 0 | 3471 | 592 | 4063 |
| 1 | 83 | 41 | 124 |
The relative risk is estimated as \(\hat{RR} = \frac{\hat{p}_1}{\hat{p}_0}\), i.e. the proportion of people with the disease that were exposed divided by the proportion of people wit the disease that were not exposed. So, \(\hat{RR} = \frac{41/124}{592/4063} = 2.2693\).
Now, a \(95\%\) confidence interval for the log relative risk, we calculate \(\log(RR) \pm 1.96 \cdot \sqrt{\frac{(n_1 - x_1)/x_1}{n_1} + \frac{(n_0 - x_0)/x_0}{n_0}}\), where \(x_0, x_1\) are the number of people with the disease that were unexposed and exposed, respectively, and \(n_0, n_1\) the total number of people exposed and unexposed, respectively. I.e. the \(95%\) Confidence Interval is
\[\begin{align*} \log(RR) \pm 1.96 \cdot \sqrt{\frac{(n_1 - x_1)/x_1}{n_1} + \frac{(n_0 - x_0)/x_0}{n_0}} &= log(2.2692758) \pm 1.96 \cdot \sqrt{\frac{(124 - 83)/83}{124} + \frac{(4063 - 3471)/3471}{4063}} \\ &= 0.8195 \pm 1.96*0.063448 \\ &= [0.5582, 1.0807] \end{align*}\]
This in itself isn’t super useful – how do you interpret a the log relative risk? However, we can easily create a confidence interval for the relative risk using this: simply exponentiate both limits. So, a \(95\%\) confidence interval for the relative risk is \([\exp(0.5582), \exp(1.0807)] = [1.7475, 2.9467]\).
We can do this for all the binary variables. First, the confidence intervals for the log relative risks
RRs_tibble %>%
select(-RR_est) %>%
pander::pander()
| variable | logRR_est | lower_CI_log | upper_CI_log |
|---|---|---|---|
| BPMeds | 0.8195 | 0.5582 | 1.081 |
| currentSmoker | 0.09194 | -0.05042 | 0.2343 |
| diabetes | 0.9202 | 0.6629 | 1.178 |
| gender | 0.4156 | 0.2732 | 0.5579 |
| prevalentHyp | 0.8159 | 0.6758 | 0.956 |
| prevalentStroke | 1.075 | 0.6269 | 1.523 |
Then, the confidence intervals for the relative risks.
RRs_tibble %>%
select(-logRR_est) %>%
mutate_at(vars(contains("log")), exp) %>%
rename_at(vars(contains("log")), str_remove_all, "_log") %>%
pander::pander()
| variable | RR_est | lower_CI | upper_CI |
|---|---|---|---|
| BPMeds | 2.269 | 1.748 | 2.947 |
| currentSmoker | 1.096 | 0.9508 | 1.264 |
| diabetes | 2.51 | 1.94 | 3.246 |
| gender | 1.515 | 1.314 | 1.747 |
| prevalentHyp | 2.261 | 1.966 | 2.601 |
| prevalentStroke | 2.93 | 1.872 | 4.586 |
How does this relate to the results of the chi square tests?
Let’s take a look at all the test results.
all_test_results <- bind_rows(
ttests %>%
select(variable, p.value),
chisq_results %>%
select(variable, p.value)
)
all_test_results %>%
mutate(p.value = if_else(p.value < 0.0001, "<0.0001", format(round(p.value, digits = 4), scientific = F))) %>%
pander::pander()
| variable | p.value |
|---|---|
| age | <0.0001 |
| cigsPerDay | 0.0002 |
| totChol | <0.0001 |
| sysBP | <0.0001 |
| diaBP | <0.0001 |
| BMI | <0.0001 |
| heartRate | 0.1359 |
| glucose | <0.0001 |
| gender | <0.0001 |
| currentSmoker | 0.2211 |
| BPMeds | <0.0001 |
| prevalentStroke | 0.0002 |
| prevalentHyp | <0.0001 |
| diabetes | <0.0001 |
| education | <0.0001 |
That’s a total of 15 tests. Now, none of these are close to our typical cutoff of 0.05, so correcting for multiple tests probably won’t change much. But let’s do it anyway.
Bonferroni corrected p-values are found by multiplying the p-value by the number of tests, here 15.
The Benjamini-Hochberg procedure works a bit differently. First, you rank your p-values from smallest to largest. Then, you calculate a corrected p-value as the p-value multiplied by the number of test over the rank. We then find the largest rank where the corrected p-value is smaller than 0.05, and reject all hypotheses before that – regardless of the values of the p-values. Here, \(k=13\) is the largest ranked p-value with corrected p-value less than 0.05, so we reject all hypotheses listed above that in the table below.
all_test_results %>%
arrange(p.value) %>%
mutate(k = row_number(),
multiplier = 15/k,
corrected_p_value = p.value*multiplier) %>%
pander::pander()
| variable | p.value | k | multiplier | corrected_p_value |
|---|---|---|---|---|
| age | 5.573e-50 | 1 | 15 | 8.36e-49 |
| sysBP | 4.217e-46 | 2 | 7.5 | 3.163e-45 |
| prevalentHyp | 1.189e-30 | 3 | 5 | 5.945e-30 |
| diaBP | 2.171e-21 | 4 | 3.75 | 8.14e-21 |
| diabetes | 5.525e-10 | 5 | 3 | 1.658e-09 |
| gender | 1.122e-08 | 6 | 2.5 | 2.804e-08 |
| BPMeds | 3.097e-08 | 7 | 2.143 | 6.636e-08 |
| totChol | 9.335e-08 | 8 | 1.875 | 1.75e-07 |
| education | 5.19e-07 | 9 | 1.667 | 8.651e-07 |
| BMI | 9.696e-07 | 10 | 1.5 | 1.454e-06 |
| glucose | 1.618e-06 | 11 | 1.364 | 2.207e-06 |
| cigsPerDay | 0.0001769 | 12 | 1.25 | 0.0002211 |
| prevalentStroke | 0.0001796 | 13 | 1.154 | 0.0002072 |
| heartRate | 0.1359 | 14 | 1.071 | 0.1456 |
| currentSmoker | 0.2211 | 15 | 1 | 0.2211 |
Here are all the p-values side by side. Notice how the Bonferroni method is way too conservative – some of the p-values are corrected from \(\approx 0.14\) and \(\approx 0.22\) to \(1\)!!
all_test_results %>%
arrange(p.value) %>%
mutate(p.value.bonf = p.adjust(p.value, method = "bonferroni"),
p.value.BH = p.adjust(p.value, method = "BH")) %>%
pander::pander()
| variable | p.value | p.value.bonf | p.value.BH |
|---|---|---|---|
| age | 5.573e-50 | 8.36e-49 | 8.36e-49 |
| sysBP | 4.217e-46 | 6.326e-45 | 3.163e-45 |
| prevalentHyp | 1.189e-30 | 1.783e-29 | 5.945e-30 |
| diaBP | 2.171e-21 | 3.256e-20 | 8.14e-21 |
| diabetes | 5.525e-10 | 8.288e-09 | 1.658e-09 |
| gender | 1.122e-08 | 1.682e-07 | 2.804e-08 |
| BPMeds | 3.097e-08 | 4.645e-07 | 6.636e-08 |
| totChol | 9.335e-08 | 1.4e-06 | 1.75e-07 |
| education | 5.19e-07 | 7.786e-06 | 8.651e-07 |
| BMI | 9.696e-07 | 1.454e-05 | 1.454e-06 |
| glucose | 1.618e-06 | 2.428e-05 | 2.207e-06 |
| cigsPerDay | 0.0001769 | 0.002654 | 0.0002072 |
| prevalentStroke | 0.0001796 | 0.002694 | 0.0002072 |
| heartRate | 0.1359 | 1 | 0.1456 |
| currentSmoker | 0.2211 | 1 | 0.2211 |
So, do we spot the difference we just found? For the continuous variables, let’s look at boxplots.
cont_boxplots <- framingham %>%
gather(key = "variable", value = "value", {cont_vars}) %>%
select(TenYearCHD, variable, value) %>%
mutate(TenYearCHD = as.character(TenYearCHD)) %>%
ggplot(aes(x = TenYearCHD, y = value, fill = TenYearCHD)) +
geom_boxplot(alpha = 0.5) +
facet_wrap(variable~., scales = "free", ncol = 4) +
scale_fill_discrete()
plotly::ggplotly(cont_boxplots)
Not quite. Why not?